In [53]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from PIL import Image
# Use the tf.keras namespace consistently: mixing the standalone `keras`
# package with `tensorflow.keras` can produce incompatible layer/model objects.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
# Layer imports de-duplicated (Flatten/Conv2D were imported twice); the
# MaxPool2D / GlobalAvgPool2D aliases are kept for any downstream use.
from tensorflow.keras.layers import (
    Input, Add, ZeroPadding2D, Activation, BatchNormalization,
    Conv2D, MaxPooling2D, MaxPool2D, AveragePooling2D,
    Flatten, Dense, Dropout, ReLU,
    GlobalMaxPooling2D, GlobalAveragePooling2D, GlobalAvgPool2D,
)
from tensorflow.keras.initializers import glorot_uniform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
## Data Preprocessing
In [54]:
# Kaggle dataset locations and the competition metadata CSVs.
supp_dir = '/kaggle/input/ubc-ovarian-cancer-competition-supplemental-masks'
data_dir = '/kaggle/input/UBC-OCEAN'
train_csv = pd.read_csv(f'{data_dir}/train.csv')
test_csv = pd.read_csv(f'{data_dir}/test.csv')
In [55]:
# Keep only whole-slide images (drop tissue microarray samples).
train_csv = train_csv.loc[train_csv['is_tma'] == False]
train_data, val_data = train_test_split(train_csv, test_size=0.2, random_state=42)

# Build thumbnail paths for each split.
train_image_paths = [f'{data_dir}/train_thumbnails/{img_id}_thumbnail.png' for img_id in train_data['image_id']]
val_image_paths = [f'{data_dir}/train_thumbnails/{img_id}_thumbnail.png' for img_id in val_data['image_id']]
test_image_paths = [f'{data_dir}/test_thumbnails/{img_id}_thumbnail.png' for img_id in test_csv['image_id']]

# Multi-class classification: one-hot encode the string labels.
# OneHotEncoder expects a 2-D array, hence the reshape to a single column.
one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels = np.array(train_data['label'])
val_labels = np.array(val_data['label'])
train_labels_reshaped = train_labels.reshape(-1, 1)
val_labels_reshaped = val_labels.reshape(-1, 1)
# Fit on the training labels only; reuse the fitted categories for validation.
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels_reshaped)
val_labels_one_hot = one_hot_encoder.transform(val_labels_reshaped)
#print(test_labels_one_hot)
#print(val_labels_one_hot)
In [56]:
# (1) Rescale pixel values to [0, 1]; random shear / horizontal flips are
# applied to TRAINING images only, to reduce overfitting.
datagen = ImageDataGenerator(rescale = 1./255,
shear_range = 0.2,
#zoom_range = 0.2,
horizontal_flip = True
)
# Validation/test images must NOT be randomly augmented — rescale only —
# otherwise the model is evaluated on distorted inputs.
eval_datagen = ImageDataGenerator(rescale = 1./255)
def load_and_augment_img(img_path, augment=True):
    """Load one thumbnail, resize to 224x224, and run it through the
    appropriate ImageDataGenerator.

    Parameters
    ----------
    img_path : str
        Path to a thumbnail PNG.
    augment : bool, default True
        True  -> random augmentation + rescale (training images).
        False -> rescale only (validation / test images).

    Returns
    -------
    numpy.ndarray of shape (224, 224, 3) with values in [0, 1].
    """
    # PNG thumbnails may carry an alpha channel; force 3-channel RGB.
    img = Image.open(img_path).convert('RGB')
    img = img.resize((224, 224))  # Resize to desired dimensions
    img = np.array(img)  # Convert to numpy array
    img = img.reshape((1,) + img.shape)  # flow() expects a leading batch dim
    gen = datagen if augment else eval_datagen
    # Use builtin next(); the generator's .next() method is deprecated.
    img = next(gen.flow(img, batch_size=1))
    return img[0]
# Augment training images; only rescale validation and test images.
train_images_augmented = [load_and_augment_img(path) for path in train_image_paths]
val_images_augmented = [load_and_augment_img(path, augment=False) for path in val_image_paths]
test_images_augmented = [load_and_augment_img(path, augment=False) for path in test_image_paths]
In [57]:
import matplotlib.pyplot as plt

def visualize(image):
    """Render `image` in a 10x10-inch figure with the axes hidden."""
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.axis('off')
In [58]:
# Alias the IPython display helper: importing it as `Image` would shadow
# PIL.Image (imported at the top of the notebook), breaking any later
# re-run of load_and_augment_img, which calls Image.open.
from IPython.display import Image as IPyImage
IPyImage(filename=train_image_paths[0], width=768, height=768)
Out[58]: